Visualizing Gene Variant Data¶
Setup¶
import polars as pl
import ipywidgets as widgets
import os
import plotly_express as px
import plotly.io as pio
# Required for `nbconvert` to include Plotly plots in the output HTML
pio.renderers.default = "notebook"
Parameter Definition¶
gene_id = "HLA-A"
Loading the Data¶
data_dir = "../data/processed/"
# Create a dropdown widget to select the gene
gene_options = [
os.path.basename(f).replace("-pop-vars.parquet", "")
for f in os.listdir(data_dir)
if f.endswith("-pop-vars.parquet")
]
print(f"Available genes: {gene_options}")
# Load data
vars_path = os.path.join(data_dir, f"{gene_id}-variants.parquet")
pops_path = os.path.join(data_dir, f"{gene_id}-pop-vars.parquet")
df_vars = pl.read_parquet(vars_path)
df_pops = pl.read_parquet(pops_path)
print(f"Loaded {df_vars.height} variants for gene {gene_id}.")
Available genes: ['HLA-A', 'CYP2D6', 'HLA-B'] Loaded 4182 variants for gene HLA-A.
Variant Position in Gene¶
fig = px.histogram(
df_vars,
x="pos",
nbins=100,
title=f"Variant Position Distribution for {gene_id}",
labels={"pos": "Genomic Position"},
)
fig.update_yaxes(title_text="Number of Variants")
fig.show()
Variant Effects¶
display(
df_vars.get_column("consequence")
.value_counts()
.sort("count", descending=True)
.head(10)
)
display(
df_vars.get_column("consequence")
.value_counts()
.sort("count", descending=True)
.tail(10)
)
| consequence | count |
|---|---|
| str | u32 |
| "intron_variant" | 2681 |
| "non_coding_transcript_exon_var… | 398 |
| "missense_variant" | 262 |
| "regulatory_region_variant" | 192 |
| "3_prime_UTR_variant" | 140 |
| "frameshift_variant" | 95 |
| "intergenic_variant" | 94 |
| "synonymous_variant" | 90 |
| "splice_region_variant" | 69 |
| "splice_polypyrimidine_tract_va… | 39 |
| consequence | count |
|---|---|
| str | u32 |
| "5_prime_UTR_variant" | 14 |
| "inframe_deletion" | 14 |
| "splice_donor_region_variant" | 13 |
| "splice_donor_5th_base_variant" | 10 |
| "splice_donor_variant" | 10 |
| "inframe_insertion" | 8 |
| "TF_binding_site_variant" | 6 |
| "start_lost" | 3 |
| "protein_altering_variant" | 2 |
| "stop_lost" | 1 |
Clinical Significance of Variants¶
display(
df_vars.get_column("clinical_significance")
.value_counts()
.sort("count", descending=True)
.head(10)
)
# Filter variants with clinical significance
df_vars.filter(
pl.col("clinical_significance") != [],
)
| clinical_significance | count |
|---|---|
| list[str] | u32 |
| [] | 4177 |
| ["benign"] | 1 |
| ["uncertain significance"] | 1 |
| ["likely benign"] | 1 |
| ["not provided"] | 1 |
| ["risk factor"] | 1 |
| id | chr | pos | ref | alt | consequence | clinical_significance |
|---|---|---|---|---|---|---|
| str | str | i64 | str | str | str | list[str] |
| "rs1370973378" | "6" | 29941373 | "A" | "G" | "intergenic_variant" | ["uncertain significance"] |
| "rs45585732" | "6" | 29942772 | "G" | "A" | "missense_variant" | ["likely benign"] |
| "rs199474493" | "6" | 29943307 | "G" | "A" | "missense_variant" | ["benign"] |
| "rs1554115495" | "6" | 29944277 | "CC" | "C" | "frameshift_variant" | ["not provided"] |
| "rs1061235" | "6" | 29945521 | "A" | "T" | "3_prime_UTR_variant" | ["risk factor"] |
Minor Allele Frequency (MAF) Distribution¶
The histogram below shows the minor allele frequency (MAF) distribution for the gene of interest, fetched from EMBL. Very rare and very common variants are removed.
# Remove very rare or common variants
df_pops_filtered = df_pops.filter((pl.col("MAF") >= 0.01) & (pl.col("MAF") <= 0.99))
px.histogram(
df_pops_filtered,
x="MAF",
title=f"Allele Counts by Population for {gene_id}",
)
Population Variant Frequency Analysis¶
We can also look at MAF per population. There are many different populations in the EMBL database. For this analysis, we will focus on the gnomAD datasets (gnomADg for whole genome data).
# Filter for gnomAD populations only
df_pops_filtered = df_pops_filtered.filter(pl.col("population").str.contains("gnomADe"))
px.box(
df_pops_filtered,
y="MAF",
x="population",
title=f"MAF Distribution by Population for {gene_id}",
log_y=True,
labels={"MAF": "MAF (log scale)", "population": "Population"},
)
Plotting Variant Frequencies of European and African Populations¶
By plotting the MAF of both these populations against each other, we can identify population-specific variants. In the plot below, variants in the upper left corner are common in African individuals but rare in non-Finnish European, while variants in the lower right corner are rare in Europeans and common in Africans.
df_wide = df_pops.pivot(
values="MAF",
index="id",
on="population",
aggregate_function="first",
)
px.scatter(
df_wide,
x="gnomADe:nfe",
y="gnomADe:afr",
hover_name="id",
labels={
"gnomADe:nfe": "MAF in Non-Finnish European",
"gnomADe:afr": "MAF in African",
},
title=f"MAF in NFE vs AFR Populations for {gene_id}",
)
Heatmap of Variant MAFs Across Populations¶
A heatmap of the minor allele frequencies of each variant across the different population can reveal population structure.
df_wide_filtered = df_pops_filtered.pivot(
index="id",
on="population",
values="MAF",
aggregate_function="first",
)
pdf = df_wide_filtered.to_pandas().set_index("id")
# 7. Plot heatmap
fig = px.imshow(
pdf,
aspect="auto",
color_continuous_scale="Viridis",
labels=dict(x="Population", y="Variant ID", color="MAF"),
title=f"MAF heatmap for {gene_id}",
height=600,
)
fig.update_xaxes(side="top")
fig.update_yaxes(automargin=True, showticklabels=False)
fig.update_layout(
hovermode="closest",
coloraxis_colorbar=dict(title="MAF"),
)
fig.show()